# Imports
import shap
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from operator import itemgetter
# Load SHAP's JavaScript runtime so interactive plots (e.g. force_plot) render in the notebook.
shap.initjs()
# Load the cleaned Unicauca network-traffic dataset.
data = pd.read_csv('Dataset-Unicauca-Version2-87Atts-Clean.csv')
# Earlier processing bucketed data usage into a "labels" column; rename it
# so that "labels" can be reused for the encoded classification target below.
data.rename({"labels": "DataFlow"}, axis=1, inplace=True)
# Encode ProtocolName into integer targets for the Random Forest model.
lb_make = LabelEncoder()
data["labels"] = lb_make.fit_transform(data["ProtocolName"])
data[["ProtocolName", "labels"]].head(11)
# Keep a protocol-name -> label-id mapping for the dropdown widgets below.
mapped_label_ids = dict(zip(lb_make.classes_, range(len(lb_make.classes_))))
# Remove identifier / target-leakage columns that must not be model features.
for leak_col in ("Flow.ID", "ProtocolName", "L7Protocol"):
    data.drop(columns=[leak_col], inplace=True)
# The encoded target is the last column; split it off, then hold out 20% for testing.
y = data.iloc[:, -1]
data.drop(columns=data.columns[-1], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)
# Prepare Interactive Predictive Label Selector for Visualizations
import ipywidgets as widgets
# All distinct encoded labels, kept for reference by later cells.
list_of_labels = y.unique().tolist()
# Pair each label with its index so a selection can return the index.
tuple_of_labels = list(zip(list_of_labels, range(len(list_of_labels))))

def _label_dropdown():
    """Build a dropdown that maps protocol names to encoded label ids."""
    return widgets.Dropdown(options=mapped_label_ids,
                            value=0,
                            description='Select Label:')

# Two independent selector widgets for the visualizations below.
current_label = _label_dropdown()
current_label2 = _label_dropdown()
# Function to report accuracy
def print_accuracy(f, X=None, y=None):
    """Print a model's accuracy (as a percentage) on a labelled dataset.

    Parameters
    ----------
    f : callable
        Prediction function (e.g. ``model.predict``) mapping features to labels.
    X : array-like, optional
        Feature rows to predict on. Defaults to the notebook-global ``X_test``,
        preserving the original call sites that pass only ``f``.
    y : array-like, optional
        Ground-truth labels aligned with ``X``. Defaults to the global ``y_test``.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test
    # Fraction of matching predictions, scaled to a percentage.
    print("Accuracy = {0}%".format(100 * np.sum(f(X) == y) / len(y)))
    time.sleep(0.5)  # to let the print get out before any progress bars
# Random Forest Explainable (only taking the first 5000 records; the entire dataset takes too long to model)
rforest = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    min_samples_split=20,
    random_state=0,
)
# Fit on the leading 5000 training rows to keep runtime manageable.
subset = 5000
rforest.fit(X_train.head(subset), y_train.head(subset))
print_accuracy(rforest.predict)
Accuracy = 58.77994576915551%
# GPU accelerated SHAP values
# Explain the fitted forest with the GPU tree explainer, then compute SHAP
# (and interaction) values on a random 1000-row sample of the test set —
# the full test set would be too slow.
explainer = shap.explainers.GPUTree(rforest, feature_perturbation="tree_path_dependent")
shap_sample = shap.sample(X_test, nsamples=1000)
shap_values = explainer.shap_values(shap_sample)
shap_interaction_values = explainer.shap_interaction_values(shap_sample)
This is a standard violin plot but with outliers drawn as points. This gives a more accurate representation of the density of the outliers than a kernel density estimated from so few points. The color represents the average feature value at that position, so red regions have mostly high-valued features while blue regions have mostly low-valued features.
current_label
Dropdown(description='Select Label:', options={'99TAXI': 0, 'AMAZON': 1, 'APPLE': 2, 'APPLE_ICLOUD': 3, 'APPLE…
# Violin summary plot for the label selected in the dropdown.
# NOTE: the SHAP values above were computed on `shap_sample` (a random sample),
# so the same rows must be passed as `features`. The original passed
# X_test.iloc[0:1000, :] — different rows — which mispairs feature values
# with SHAP values and corrupts the color scale.
shap.summary_plot(shap_values=shap_values[current_label.value],
                  features=shap_sample,
                  plot_type="violin")
By changing the label to something else we can see what features on the left have a higher impact on the model predicting this label. For example, when predicting the Protocol Name of "99TAXI" we see the feature L7Protocol has little importance on predicting this Protocol. However, the feature Forward Packet Length does.
current_label
Dropdown(description='Select Label:', index=1, options={'99TAXI': 0, 'AMAZON': 1, 'APPLE': 2, 'APPLE_ICLOUD': …
# Force plot for the selected label. Use the class's own base value
# (expected_value[current_label.value], not [0]) and reuse `shap_sample` —
# the rows the SHAP values were computed on; the original re-sampled X_test,
# pairing SHAP values with unrelated feature rows.
shap.force_plot(explainer.expected_value[current_label.value], shap_values[current_label.value], shap_sample, link='logit')
Again, by changing the predicted label to something else we can see what features are significant and not significant, like from the previous visualization. However, we can also interact with different features and comparisons from additional labels on the top and to the left of the visualization. We can see that when both the highlighted features equal a particular value what the other features we are comparing to on top may equal.
# Bar Plot of Absolute Percentage Importance of each Feature
tree_num = 0  # index into the per-CLASS SHAP value list (one array per class, not per tree)
# Normalize absolute SHAP values to percentages of the class total.
abs_pct = np.abs(shap_values[tree_num] / np.sum(shap_values[tree_num])) * 100
# feature_names must be the column labels; the original passed a DataFrame
# slice (X_test.iloc[0:1000, :]), which is not a list of names.
explanation = shap.Explanation(abs_pct, feature_names=list(X_test.columns))
shap.plots.bar(explanation[current_label.value], max_display=25)
This plot shows the proportional impact of each feature using mean absolute SHAP values. The larger the number, the higher the impact a feature has on the model's categorization.
# Per-class expected (base) values of the explainer — one entry per encoded
# protocol label; displayed as notebook output below. These are the model's
# average outputs before any feature contributions are added.
explainer.expected_value
[0.022943999618291855, 0.0021860001143068075, 0.0007580000092275441, 0.0029180001001805067, 0.0015559999737888575, 0.000615999975707382, 0.007476000115275383, 0.00810800027102232, 0.014423999935388565, 0.26695799827575684, 0.0004239999980200082, 0.192548006772995, 0.08785399794578552, 0.00020599999697878957, 0.17839999496936798, 0.0007859999896027148, 0.00023999999393709004, 0.015666000545024872, 0.0001539999939268455, 0.0037479999009519815, 0.0007980000227689743, 0.00037799999699927866, 0.002687999978661537, 0.0071680000983178616, 0.00041000000783242285, 0.11025600135326385, 0.00017800000205170363, 0.005942000076174736, 0.00020799999765586108, 0.0006300000241026282, 0.0005660000024363399, 0.01066999975591898, 0.006512000225484371, 0.04562599956989288]
# View the features that drive predictions for one encoded class.
label_idx = 32  # encoded protocol label whose SHAP values we visualize
# The original printed int(expected_value[0]) — a base rate truncated to 0 —
# as the "predicted label"; report the actual class index instead.
print("Visualization for label index: " + str(label_idx))
# Pair the class's SHAP values with its own base value, and reuse
# `shap_sample` (the rows the SHAP values were computed on) rather than
# drawing a fresh, unrelated sample of X_test.
shap.force_plot(explainer.expected_value[label_idx], shap_values[label_idx], shap_sample)
Visualization for predicted label: 0
Now interact with the dropdowns at the top to select the feature and see the values of that feature that may predict the label above.
# Get names of columns and importances from the model
cols = list(data.columns)
importances = rforest.feature_importances_
# Map importances to their names
feature_dict = dict(zip(cols, importances))
# Keep only the N most important features, ordered high -> low.
N = 10
top_pairs = sorted(feature_dict.items(), key=itemgetter(1), reverse=True)[:N]
sorted_dict = dict(top_pairs)
sorted_cols = [name for name, _ in top_pairs]
print(sorted_cols)
sorted_importances = [score for _, score in top_pairs]
print(sorted_importances)
['Init_Win_bytes_backward', 'Init_Win_bytes_forward', 'min_seg_size_forward', 'Flow.IAT.Max', 'Flow.Duration', 'act_data_pkt_fwd', 'Fwd.Packets.s', 'Flow.Packets.s', 'Fwd.IAT.Max', 'Average.Packet.Size'] [0.05130009037407691, 0.04951396482244519, 0.046412074270445115, 0.029383186410329713, 0.025938456114675365, 0.02396187819056117, 0.023470338215664723, 0.022309228565277353, 0.022027570268548038, 0.021037002705192202]
# Plot the top-N feature importances as a horizontal bar chart.
# The original called plt.figure(figsize=(10, 10)) AFTER plotting, which only
# opened an empty extra figure (the stray "<Figure ... with 0 Axes>" outputs);
# size the real figure at creation instead.
fig, ax = plt.subplots(figsize=(10, 10))
width = 0.4  # the width of the bars
ind = np.arange(len(sorted_importances))  # the y locations for the bars
ax.barh(ind, sorted_importances, width, color="green")
ax.set_yticks(ind + width / 10)
ax.set_yticklabels(sorted_cols, minor=False)
ax.invert_yaxis()  # largest importance on top
ax.set_title("Most Important Features in Model")
ax.set_xlabel("Relative importance")
ax.set_ylabel("Feature")
<Figure size 720x720 with 0 Axes>
<Figure size 720x720 with 0 Axes>
# Separate object-dtype (non-numeric) columns from the numeric ones.
non_num_cols = [col for col in data.columns if data[col].dtype == 'O']
# Preserve the DataFrame's column order. The original used
# list(set(...) - set(...)), whose ordering is arbitrary and varies between
# runs, making the correlation matrix's axis ordering non-deterministic.
num_cols = [col for col in data.columns if col not in non_num_cols]
# Pairwise Pearson correlation of all numeric columns.
corr = data[num_cols].corr()
# Correlation on full dataset
# Render the numeric-column correlation matrix as a heatmap image with
# feature names on both axes and a colorbar legend.
corr_fig = plt.figure(figsize=(25, 25))
plt.matshow(corr, fignum=corr_fig.number)
plt.title('Correlation Matrix of Numeric columns in the dataset', fontsize=20)
tick_positions = range(len(num_cols))
plt.xticks(tick_positions, num_cols, fontsize=14, rotation=90)
plt.yticks(tick_positions, num_cols, fontsize=14)
plt.gca().xaxis.set_ticks_position('bottom')
colorbar = plt.colorbar(fraction=0.0466, pad=0.02)
colorbar.ax.tick_params(labelsize=10)
plt.show()